Start by importing the offense data from Tim’s Excel workbook. To help with visualising the data we need to pivot to narrow format, fix up the dates, and tidy up the columns a bit.
library(dplyr)
library(tidyr)
library(readxl)
library(lubridate)
offence_by_month <-
read_excel("~/Desktop/Tim/RCI_offencebymonth.xlsm") %>%
gather(key = "Date", value = "Count", -LGA, -`Offence category`, -Subcategory) %>%
mutate(Date = as.Date(as.numeric(Date), origin = "1900-01-01")) %>%
mutate(Date = make_date(year(Date), month(Date))) %>% # Making sure dates are 1st of month
rename(Offence = `Offence category`) %>%
select(LGA, Date, Offence, Subcategory, Count)
## Warning in strptime(x, format, tz = tz): unknown timezone 'zone/tz/2017c.
## 1.0/zoneinfo/Australia/Sydney'
We can now plot the number of offences per month over time for the 5 LGAs we’re interested in, plus Wollongong (mainly because it makes the plot look better to have 6 instead of 5).
library(ggplot2)
library(ggthemes)
offence_by_month %>%
filter(LGA %in% c("Sydney", "Blacktown", "Newcastle", "Fairfield", "Mosman", "Wollongong")) %>%
ggplot(aes(x=year(Date), weight=Count, fill=LGA)) +
geom_bar() +
facet_wrap(~LGA) +
xlab("Year") + ylab("Offences") +
theme_tufte(base_size = 14, base_family = "sans") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), legend.position = "none")
Now we need to go and get the police strength (staffing) figures. The datasets are a bit of a mess, with a mix of HTML pages and PDFs, and with a constantly changing web address. I’ve manually worked through all of these issues and kept everything in an Excel sheet to make it a bit easier to manage.
This takes a few minutes to run.
library(rvest)
library(stringr)
library(readr)
library(lubridate)
library(pdftools)
# This is a very messy set of data with a constantly changing format. The script
# below works at the moment, but it's not pretty, and there are still a few
# niggling issues. But it's good enough for now!
controls <- read_excel("~/code/police/scraping_controls.xlsx", na = "NA")
controls <- controls[!is.na(controls$URL),]
monthly_data <- vector(mode = "list", length = nrow(controls))
for (i in 1:nrow(controls)) {
month_name <- format(make_date(month=controls$Month[i]), "%B")
year <- controls$Year[i]
message("Extracting data for ", month_name, " ", year)
if (controls$Format[i] == "html") {
dat <-
html(controls$URL[i]) %>%
html_node("#content-main div") %>%
html_text() %>%
str_replace_all("\r\n\r\n", "|") %>%
str_replace_all("\r\n", "\n") %>%
read_delim(delim = "|")
names(dat) <- c("REGION", "LAC", "AUTH", "ACTUALS")
keep_up_to <- which(str_detect(dat$REGION, "Command"))[1] - 1
# Drop first row and all crud at the end, also drop first column
dat <- dat[2:keep_up_to,-1]
# Drop totals
total_rows <- which(str_detect(dat$LAC, "Total") | str_detect(dat$LAC, "^\\s*$"))
dat <- dat[-total_rows,]
dat$DATE <- make_date(controls$Year[i], controls$Month[i])
dat$LAC <- str_replace(dat$LAC, "^ *", "")
monthly_data[[i]] <- dat
} else if (controls$Format[i] == "pdf") {
dat <- pdf_text(controls$URL[i])[1] %>% str_split("\n")
header_row <- which(str_detect(dat[[1]], "Region.*Auth.*Actual"))
dat <- dat[[1]][-c(1:header_row)]
dat <- dat[-which(dat == "" | str_detect(dat, "Use Only"))]
total_rows <- which(str_detect(dat, "Total"))
dat <- dat[-total_rows]
dat <- str_split(dat, " +")
dat <- as.data.frame(t(data.frame(dat)))
row.names(dat) <- 1:nrow(dat)
names(dat) <- c("REGION", "LAC", "AUTH", "ACTUALS")
dat$DATE <- make_date(controls$Year[i], controls$Month[i])
dat$LAC <- str_replace(dat$LAC, "^ *", "")
monthly_data[[i]] <- dat[,-1]
}
}
police_strength <- do.call("rbind", monthly_data)
police_strength$AUTH <- as.numeric(as.character(police_strength$AUTH))
police_strength$ACTUALS <- as.numeric(as.character(police_strength$ACTUALS))
police_strength <- police_strength[!is.na(police_strength$LAC),]
police_strength[police_strength$LAC == "Central Metropolitan","LAC"] <- "Central Metro"
police_strength[str_detect(police_strength$LAC, "Sydney City"),"LAC"] <- "Sydney City"
With all of the data collected, we now need to filter and process so it’s ready for plotting. This requires a bunch of data cleaning, then a filter and a pivot, followed by the creation of some new columns for the LGA staffing levels (based on the formula given by Tim). We also need to manually insert the value for Fairfield LGA for July 2013, as it somehow got missed and it’s quicker to fix manually than it is to fix the import process.
# Fixing most of the NAs
missing_rows <- which(is.na(police_strength$ACTUALS))
police_strength$ACTUALS[missing_rows] <- 0
library(tidyr)
police_strength_LGA <-
police_strength %>%
# Fixing some data quality issues with LAC names
mutate(LAC = str_replace(LAC, "(Quakers Hill.*)", "Quakers Hill")) %>%
mutate(LAC = str_replace(LAC, "Bktown", "Blacktown")) %>%
# Fixing a name change
mutate(LAC = str_replace(LAC, "City Central", "Sydney City")) %>%
# Filter for just the stations we care about
filter(LAC %in% c("Sydney City", "Central Metro", "City Central",
"Kings Cross", "Redfern", "Leichhardt", "Surry Hills",
"Blacktown", "Quakers Hill", "Mt Druitt", "Newcastle City",
"Fairfield", "Harbourside")) %>%
# Pivot into a wide table
select(-AUTH) %>%
spread(LAC, ACTUALS) %>%
# Calulate LGA staffing using Tim's formula
mutate(SydneyLGA =
`Sydney City` +
`Central Metro` +
`Kings Cross` +
`Redfern` +
`Surry Hills` +
(`Leichhardt`/2),
BlacktownLGA =
`Blacktown` +
`Quakers Hill` +
`Mt Druitt`,
NewcastleLGA =
`Newcastle City`,
FairfieldLGA =
`Fairfield`,
MosmanLGA =
(`Harbourside`/2)) %>%
# Select just the LGAs (remove all of the LACs)
select(DATE, SydneyLGA, BlacktownLGA, NewcastleLGA, FairfieldLGA, MosmanLGA)
# Somehow lost a single value for Fairfield, so inserting it manually
fix_row <- which(police_strength_LGA$DATE == "2013-07-01")
police_strength_LGA[fix_row, "FairfieldLGA"] <- 169
Cool. Now we can plot how each LGA’s staffing changed over time.
police_strength_LGA %>%
# Pivoting back to narrow for plotting
gather("LGA", "Staff", -DATE) %>%
# Making things look nicer
mutate(LGA = str_replace(LGA, "LGA", "")) %>%
# Plotting
ggplot(aes(x = DATE, y = Staff, col = LGA)) +
geom_line() +
labs(x = "Date", y = "Staffing (actual)") +
theme_tufte(base_size = 14, base_family = "sans")
At this point, we have enough data to try some modelling approaches, however the assumptions for these approaches are going to be difficult to determine, so we can stick to visualisation for now, to see if we can identify if there are any clear stories here.
The code below creates a function which will let us plot each of the LGAs easily.
library(gtable)
library(grid)
plot_lga <- function(lga_filter) {
gg1 <-
police_strength_LGA %>%
# Pivoting back to narrow for plotting
gather("LGA", "Staff", -DATE) %>%
# Making things look nicer
mutate(LGA = str_replace(LGA, "LGA", "")) %>%
# Filtering for specified lGA only
filter(LGA == lga_filter) %>%
# Plotting
filter(DATE >= ymd("2012-04-01")) %>%
filter(DATE <= ymd("2015-12-31")) %>%
ggplot(aes(x = DATE, y = Staff)) +
geom_line(col = "blue") +
labs(x = "", y = "Police Strength", subtitle = lga_filter, title = "Police staffing and number of offences per month") +
#theme_tufte(base_size = 14, base_family = "sans") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), legend.position = "none") +
scale_x_date(limits=c(ymd("2012-04-01"), ymd("2015-12-31")), date_breaks = "years", labels = NULL)
gg2 <-
offence_by_month %>%
filter(LGA == lga_filter) %>%
filter(Date >= "2012-04-01") %>%
filter(Date <= "2015-12-31") %>%
group_by(Date, LGA) %>%
summarise(Count = sum(Count)) %>%
ggplot() +
geom_line(aes(x=Date, y=Count), col="red") +
#theme_tufte(base_size = 14, base_family = "sans") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), legend.position = "none") +
scale_x_date(limits=c(ymd("2012-04-01"), ymd("2015-12-31")), date_breaks = "years", date_labels = "%b %Y") +
labs(x = "Date", y = "Offences")
## convert plots to gtable objects
g1 <- ggplotGrob(gg1)
g2 <- ggplotGrob(gg2)
g <- rbind(g1, g2, size="first") # stack the two plots
g$widths <- unit.pmax(g1$widths, g2$widths) # use the largest widths
grid.newpage()
grid.draw(g)
}
Now to generate some plots…
plot_lga("Sydney")
plot_lga("Newcastle")
plot_lga("Fairfield")
plot_lga("Blacktown")
plot_lga("Mosman")
One thing we can do to drill down further on this is to group together similar types of crimes. This is going to be a very plot-heavy affair, so we’re going to generate them programmatically.
groupings <- list(
Homicide = c("Homicide-Murder (a)",
"Homicide-Attempted murder",
"Homicide-Murder accessory, conspiracy",
"Homicide-Manslaughter (a)"),
`Domestic Assault` = "Assault-Domestic violence related assault",
Assault = c("Assault-Non-domestic violence related assault",
"Assault-Assault Police"),
`Sexual Offences` = c("Sexual offences-Sexual assault",
"Sexual offences-Indecent assault, act of indecency and other sexual offences"),
Theft = c("Theft-Break and enter dwelling",
"Theft-Break and enter non-dwelling",
"Theft-Receiving or handling stolen goods",
"Theft-Motor vehicle theft",
"Theft-Steal from motor vehicle",
"Theft-Steal from retail store",
"Theft-Steal from dwelling",
"Theft-Steal from person",
"Theft-Stock theft",
"Theft-Fraud",
"Theft-Other theft"),
Robbery = c("Robbery-Robbery without a weapon",
"Robbery-Robbery with a firearm",
"Robbery-Robbery with a weapon not a firearm"),
`Drug Possession` = c("Drug offences-Possession and/or use of cocaine",
"Drug offences-Possession and/or use of narcotics",
"Drug offences-Possession and/or use of cannabis",
"Drug offences-Possession and/or use of amphetamines",
"Drug offences-Possession and/or use of ecstasy",
"Drug offences-Possession and/or use of other drugs"),
`Drug Dealing` = c("Drug offences-Dealing, trafficking in cocaine",
"Drug offences-Dealing, trafficking in narcotics",
"Drug offences-Dealing, trafficking in cannabis",
"Drug offences-Dealing, trafficking in amphetamines",
"Drug offences-Dealing, trafficking in ecstasy",
"Drug offences-Dealing, trafficking in other drugs"),
`Drug Offences` = c("Drug offences-Possession and/or use of cocaine",
"Drug offences-Possession and/or use of narcotics",
"Drug offences-Possession and/or use of cannabis",
"Drug offences-Possession and/or use of amphetamines",
"Drug offences-Possession and/or use of ecstasy",
"Drug offences-Possession and/or use of other drugs",
"Drug offences-Dealing, trafficking in cocaine",
"Drug offences-Dealing, trafficking in narcotics",
"Drug offences-Dealing, trafficking in cannabis",
"Drug offences-Dealing, trafficking in amphetamines",
"Drug offences-Dealing, trafficking in ecstasy",
"Drug offences-Dealing, trafficking in other drugs",
"Drug offences-Cultivating cannabis",
"Drug offences-Manufacture drug",
"Drug offences-Importing drugs",
"Drug offences-Other drug offences"),
`Disorderly Conduct` = c("Disorderly conduct-Trespass",
"Disorderly conduct-Offensive conduct",
"Disorderly conduct-Offensive language",
"Disorderly conduct-Criminal intent"),
`Against Justice Procedures` = c("Against justice procedures-Escape custody",
"Against justice procedures-Breach Apprehended Violence Order",
"Against justice procedures-Breach bail conditions",
"Against justice procedures-Fail to appear",
"Against justice procedures-Resist or hinder officer")
)
Also need to extend the plotting function from before…
plot_lga_filtered <- function(lga_filter, crime_types, group_label) {
gg1 <-
police_strength_LGA %>%
# Pivoting back to narrow for plotting
gather("LGA", "Staff", -DATE) %>%
# Making things look nicer
mutate(LGA = str_replace(LGA, "LGA", "")) %>%
# Filtering for specified lGA only
filter(LGA == lga_filter) %>%
# Plotting
filter(DATE >= ymd("2012-04-01")) %>%
filter(DATE <= ymd("2015-12-31")) %>%
ggplot(aes(x = DATE, y = Staff)) +
geom_line(col = "blue") +
labs(x = "", y = "Police Strength", subtitle = lga_filter, title = "Police staffing and number of offences per month") +
#theme_tufte(base_size = 14, base_family = "sans") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), legend.position = "none") +
scale_x_date(limits=c(ymd("2012-04-01"), ymd("2015-12-31")), date_breaks = "years", labels = NULL)
gg2 <-
offence_by_month %>%
filter(LGA == lga_filter) %>%
# Filtering for specified crime types only
filter(paste(Offence, Subcategory, sep = "-") %in% crime_types) %>%
filter(Date >= "2012-04-01") %>%
filter(Date <= "2015-12-31") %>%
group_by(Date, LGA) %>%
summarise(Count = sum(Count)) %>%
ggplot() +
geom_line(aes(x=Date, y=Count), col="red") +
#theme_tufte(base_size = 14, base_family = "sans") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), legend.position = "none") +
scale_x_date(limits=c(ymd("2012-04-01"), ymd("2015-12-31")), date_breaks = "years", date_labels = "%b %Y") +
labs(x = "Date", y = "Offences")
## convert plots to gtable objects
g1 <- ggplotGrob(gg1)
g2 <- ggplotGrob(gg2)
g <- rbind(g1, g2, size="first") # stack the two plots
g$widths <- unit.pmax(g1$widths, g2$widths) # use the largest widths
grid.newpage()
grid.draw(g)
}
LGA_list <- c("Sydney", "Blacktown", "Newcastle", "Fairfield", "Mosman")
for (i in LGA_list) {
for (j in 1:length(groupings)) {
plot_lga_filtered(i, groupings[[j]], names(groupings)[j])
print(paste("LGA: ", i, " Crime type: ", names(groupings)[j]))
}
}
## [1] "LGA: Sydney Crime type: Homicide"
## [1] "LGA: Sydney Crime type: Domestic Assault"
## [1] "LGA: Sydney Crime type: Assault"
## [1] "LGA: Sydney Crime type: Sexual Offences"
## [1] "LGA: Sydney Crime type: Theft"
## [1] "LGA: Sydney Crime type: Robbery"
## [1] "LGA: Sydney Crime type: Drug Possession"
## [1] "LGA: Sydney Crime type: Drug Dealing"
## [1] "LGA: Sydney Crime type: Drug Offences"
## [1] "LGA: Sydney Crime type: Disorderly Conduct"
## [1] "LGA: Sydney Crime type: Against Justice Procedures"
## [1] "LGA: Blacktown Crime type: Homicide"
## [1] "LGA: Blacktown Crime type: Domestic Assault"
## [1] "LGA: Blacktown Crime type: Assault"
## [1] "LGA: Blacktown Crime type: Sexual Offences"
## [1] "LGA: Blacktown Crime type: Theft"
## [1] "LGA: Blacktown Crime type: Robbery"
## [1] "LGA: Blacktown Crime type: Drug Possession"
## [1] "LGA: Blacktown Crime type: Drug Dealing"
## [1] "LGA: Blacktown Crime type: Drug Offences"
## [1] "LGA: Blacktown Crime type: Disorderly Conduct"
## [1] "LGA: Blacktown Crime type: Against Justice Procedures"
## [1] "LGA: Newcastle Crime type: Homicide"
## [1] "LGA: Newcastle Crime type: Domestic Assault"
## [1] "LGA: Newcastle Crime type: Assault"
## [1] "LGA: Newcastle Crime type: Sexual Offences"
## [1] "LGA: Newcastle Crime type: Theft"
## [1] "LGA: Newcastle Crime type: Robbery"
## [1] "LGA: Newcastle Crime type: Drug Possession"
## [1] "LGA: Newcastle Crime type: Drug Dealing"
## [1] "LGA: Newcastle Crime type: Drug Offences"
## [1] "LGA: Newcastle Crime type: Disorderly Conduct"
## [1] "LGA: Newcastle Crime type: Against Justice Procedures"
## [1] "LGA: Fairfield Crime type: Homicide"
## [1] "LGA: Fairfield Crime type: Domestic Assault"
## [1] "LGA: Fairfield Crime type: Assault"
## [1] "LGA: Fairfield Crime type: Sexual Offences"
## [1] "LGA: Fairfield Crime type: Theft"
## [1] "LGA: Fairfield Crime type: Robbery"
## [1] "LGA: Fairfield Crime type: Drug Possession"
## [1] "LGA: Fairfield Crime type: Drug Dealing"
## [1] "LGA: Fairfield Crime type: Drug Offences"
## [1] "LGA: Fairfield Crime type: Disorderly Conduct"
## [1] "LGA: Fairfield Crime type: Against Justice Procedures"
## [1] "LGA: Mosman Crime type: Homicide"
## [1] "LGA: Mosman Crime type: Domestic Assault"
## [1] "LGA: Mosman Crime type: Assault"
## [1] "LGA: Mosman Crime type: Sexual Offences"
## [1] "LGA: Mosman Crime type: Theft"
## [1] "LGA: Mosman Crime type: Robbery"
## [1] "LGA: Mosman Crime type: Drug Possession"
## [1] "LGA: Mosman Crime type: Drug Dealing"
## [1] "LGA: Mosman Crime type: Drug Offences"
## [1] "LGA: Mosman Crime type: Disorderly Conduct"
## [1] "LGA: Mosman Crime type: Against Justice Procedures"